# Loading Libraries
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.0 v dplyr 1.0.5
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'stringr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(modelr)
library(nycflights13)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(splines)
library(forcats)
library(plotly)
## Warning: package 'plotly' was built under R version 4.0.5
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
The data is provided by data.gov.uk. It describes road accidents across Leeds in the year 2017. Between all datasets, this dataset was selected because it includes around 2,200 accident with 15 variable that can be easily parsed and explored. Also, we selected this dataset because we believe that there are different questions that can be asked about the data, and that the data can provide interesting answers for our questions. The dataset includes data such as location, date, time, road setting and casualty information for each accident that transpired during the year.
Reference : https://data.world/datagov-uk/6efe5505-941f-45bf-b576-4c1e09b579a1
# Loading & Parsing Data
Accidents_2017 <- read_csv("Data/datagov-uk-6efe5505-941f-45bf-b576-4c1e09b579a1/2017-8.csv",
col_names = c("Reference_Number","Easting","Northing","Vehicles_Num","Accident_Date","Time","Road_Class","Road_Surface",
"Lightning_Cond","Weather_Cond","Vehicle_Type","Casualty_Class","Severity","Gender","Age"), skip = 1)
##
## -- Column specification --------------------------------------------------------
## cols(
## Reference_Number = col_character(),
## Easting = col_double(),
## Northing = col_double(),
## Vehicles_Num = col_double(),
## Accident_Date = col_character(),
## Time = col_character(),
## Road_Class = col_character(),
## Road_Surface = col_character(),
## Lightning_Cond = col_character(),
## Weather_Cond = col_character(),
## Vehicle_Type = col_character(),
## Casualty_Class = col_character(),
## Severity = col_character(),
## Gender = col_character(),
## Age = col_double()
## )
Accidents_Date <- Accidents_2017$Accident_Date
Accidents_Date <- parse_date(Accidents_Date, "%m/%d/%Y")
Time <- Accidents_2017$Time
Time <- parse_time(Time, "%H%M")
Accidents_2017 <- Accidents_2017 %>%
select(1:4, 7:15) %>%
mutate(Accidents_Date, Time)
Accidents_2017 <- Accidents_2017[,c(1:4,14,15,5:13)]
# Correcting spelling mistakes
Lightning_Cond <-
str_replace(Accidents_2017$Lightning_Cond,"Darkness: Street lights present and lit and lit","Darkness: Street lights present and lit")
Road_Class <- str_replace_all(Accidents_2017$Road_Class,
c("A.*" = "A","B.*" = "B","M.*" = "M"))
Road_Surface <- str_replace(Accidents_2017$Road_Surface,"^F.*","Snow")
Weather_Cond <- word(Accidents_2017$Weather_Cond,1)
Weather_Cond <- str_replace(Weather_Cond,"Fog","Other")
Vehicle_Type <- word(Accidents_2017$Vehicle_Type,1)
Vehicle_Type <- str_replace_all(Vehicle_Type,
c(".Private" = "Taxi",
"Ca.*" = "Car","Pedal" = "Cycle"))
Vehicle_Type <- str_replace(Vehicle_Type,"TaxiTaxi", "Taxi")
# Adding the improved columns to the dataset
Accidents_2017 <- Accidents_2017 %>%
select(1:6,12:15) %>%
mutate(Lightning_Cond,Weather_Cond,Road_Class,Road_Surface,Vehicle_Type,
Year = year(Accidents_Date),
Month = month(Accidents_Date),
Day = day(Accidents_Date),
Hour = hour(Time),
Minute = minute(Time),
Accidents_DateTime = make_datetime(Year, Month, Day, Hour, Minute))
Accidents_2017 <- Accidents_2017[,c(1:4, 16:20, 5:6, 21, 7:15)]
# Assigning the columns to their proper classes.
for(i in 2:length(Accidents_2017)) {
if(is.character(Accidents_2017[[i]])) {
Accidents_2017[[i]] <- as.factor(Accidents_2017[[i]])
} else if (is.numeric(Accidents_2017[[i]])) {
Accidents_2017[[i]] <- as.numeric(Accidents_2017[[i]])
}
}
Accidents_2017
## # A tibble: 2,203 x 21
## Reference_Number Easting Northing Vehicles_Num Year Month Day Hour Minute
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3AP0313 426340 428455 1 2017 3 17 8 15
## 2 3BE0850 430828 433222 2 2017 1 14 13 30
## 3 4110858 428940 429856 2 2017 1 1 8 5
## 4 4110858 428940 429856 2 2017 1 1 8 5
## 5 4111495 429899 434277 2 2017 1 1 17 5
## 6 4111706 435946 436807 2 2017 1 1 12 0
## 7 4120471 443658 436768 3 2017 1 2 12 30
## 8 4120471 443658 436768 3 2017 1 2 12 30
## 9 4121054 442103 434572 2 2017 1 2 18 7
## 10 4121054 442103 434572 2 2017 1 2 18 7
## # ... with 2,193 more rows, and 12 more variables: Accidents_Date <date>,
## # Time <time>, Accidents_DateTime <dttm>, Casualty_Class <fct>,
## # Severity <fct>, Gender <fct>, Age <dbl>, Lightning_Cond <fct>,
## # Weather_Cond <fct>, Road_Class <fct>, Road_Surface <fct>,
## # Vehicle_Type <fct>
# Checking NAs
sum(rowSums(is.na(Accidents_2017)))
## [1] 0
# No NA values to explore
EDA1 <- Accidents_2017 %>%
group_by(Gender, Age) %>%
mutate(Count = n() ) %>%
ungroup() %>%
ggplot(mapping = aes(x=Age , y = Count, color = Severity)) +
geom_point(alpha = 0.6) +
facet_wrap(~Gender) +
ggtitle("Number of Casualties Per Age For Males and Females") +
ylab("Number of Casualties") +
theme_linedraw()
layout_plot <- function(my_plot, x = -0.057, y = - 0.033){
my_plot[['x']][['layout']][['annotations']][[1]][['y']] <- x
my_plot[['x']][['layout']][['annotations']][[2]][['x']] <- y
my_plot
}
ggplotly(EDA1) %>% layout_plot
The plot shows the relationship between the Number of Casualities and their Age. In addition, the plot is classified by the Gender, therefore; it shows how the relationship differs between males and females.
For females, as the age increases the count increases. The count peaks around the mid-20s, then the count declines gradually. The males’ graph has a similar shape as the females’. The only difference is that the males’ graph has a higher peak and the drop is sharper.
Accidents_2017 %>%
group_by(Gender, Age) %>%
summarise(Count = n() )
## `summarise()` has grouped output by 'Gender'. You can override using the `.groups` argument.
## # A tibble: 182 x 3
## # Groups: Gender [2]
## Gender Age Count
## <fct> <dbl> <int>
## 1 Female 1 8
## 2 Female 2 3
## 3 Female 3 6
## 4 Female 4 2
## 5 Female 5 6
## 6 Female 6 7
## 7 Female 7 8
## 8 Female 8 5
## 9 Female 9 4
## 10 Female 10 9
## # ... with 172 more rows
EDA2 <- Accidents_2017 %>%
group_by(Casualty_Class, Age) %>%
summarise(Count = n() ) %>%
ungroup() %>%
ggplot(mapping = aes(x=Age , y = Count)) +
geom_point(alpha = 0.6) +
geom_smooth()+
facet_wrap(~Casualty_Class) +
ggtitle("Count against Age based on Casualty class") +
ylab("Number of Casualties") +
theme_linedraw()
## `summarise()` has grouped output by 'Casualty_Class'. You can override using the `.groups` argument.
ggplotly(EDA2) %>% layout_plot
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Accidents_2017 %>%
group_by(Casualty_Class,Age) %>%
summarise(Count = n() )
## `summarise()` has grouped output by 'Casualty_Class'. You can override using the `.groups` argument.
## # A tibble: 260 x 3
## # Groups: Casualty_Class [3]
## Casualty_Class Age Count
## <fct> <dbl> <int>
## 1 Driver or rider 4 1
## 2 Driver or rider 6 2
## 3 Driver or rider 7 1
## 4 Driver or rider 8 3
## 5 Driver or rider 9 1
## 6 Driver or rider 10 3
## 7 Driver or rider 11 4
## 8 Driver or rider 12 5
## 9 Driver or rider 13 5
## 10 Driver or rider 14 3
## # ... with 250 more rows
Accidents_2017 %>%
group_by(Casualty_Class) %>%
summarise(Count = n() )
## # A tibble: 3 x 2
## Casualty_Class Count
## <fct> <int>
## 1 Driver or rider 1296
## 2 Pedestrian 321
## 3 Vehicle or pillion passenger 586
# Distribution of Accidents across the course of the day for every day of the year
Accidents_2017 %>%
mutate(Accidents_by_Hour = update(Accidents_DateTime, yday = 1)) %>%
ggplot(mapping = aes(Accidents_by_Hour)) +
geom_freqpoly(binwidth = 3600) # 5 mins interval
# Distribution of Accidents across the course of the day for every day (and month) of the year
Accidents_2017 %>%
mutate(Accidents_by_Hour = update(Accidents_DateTime, yday = 1),
Accidents_by_Month = factor(Month)) %>%
ggplot(mapping = aes(Accidents_by_Hour, color = Accidents_by_Month)) +
geom_freqpoly(binwidth = 3600) # 1 hour interval
# Distribution of Accidents across the course of the day for every day (and month) of the year w/ y-axis as density
Accidents_2017 %>%
mutate(Accidents_by_Hour = update(Accidents_DateTime, yday = 1),
Accidents_by_Month = factor(Month)) %>%
ggplot(mapping = aes(Accidents_by_Hour, color = Accidents_by_Month)) +
geom_freqpoly(mapping = aes(y = ..density..), binwidth = 3600) # 1 hour interval
daily <- Accidents_2017 %>%
group_by(Accidents_Date) %>%
summarise(n = n()) %>%
mutate(wday = wday(Accidents_Date, label = T, abbr = F))
daily
## # A tibble: 360 x 3
## Accidents_Date n wday
## <date> <int> <ord>
## 1 2017-01-01 4 Sunday
## 2 2017-01-02 7 Monday
## 3 2017-01-03 3 Tuesday
## 4 2017-01-04 7 Wednesday
## 5 2017-01-05 4 Thursday
## 6 2017-01-06 5 Friday
## 7 2017-01-07 5 Saturday
## 8 2017-01-09 5 Monday
## 9 2017-01-10 9 Tuesday
## 10 2017-01-11 7 Wednesday
## # ... with 350 more rows
# Number of flights per day
ggplot(daily, mapping = aes(Accidents_Date, n)) +
geom_line()
# Number of flights in each day of the week
ggplot(daily, mapping = aes(wday, n)) +
geom_boxplot()
ggplot(daily, mapping = aes(wday, n)) +
geom_bar(stat = "identity")
# Fitting a linear model w/ predictions & residuals
model1 <- lm(n ~ wday, data = daily)
grid <- daily %>%
data_grid(wday) %>%
add_predictions(model1, "n")
ggplot(daily, mapping = aes(wday, n))+
geom_boxplot() +
geom_point(data = grid, colour = "red", size = 4)
daily <- daily %>%
add_residuals(model1)
ggplot(data = daily, mapping = aes(Accidents_Date, resid)) +
geom_ref_line(h = 0) +
geom_line()
# Let's started by counting the total number of accidents for each hour and visualizing it with ggplot2
accident_hour <- Accidents_2017 %>%
count(Hour)
accident_hour
## # A tibble: 24 x 2
## Hour n
## <dbl> <int>
## 1 0 30
## 2 1 23
## 3 2 9
## 4 3 15
## 5 4 16
## 6 5 12
## 7 6 41
## 8 7 85
## 9 8 143
## 10 9 105
## # ... with 14 more rows
ggplot(accident_hour,aes(Hour,n))+
geom_line()+
geom_point()
# Let's try to model the relationship between hours and count.
mod1 <- lm(n ~ ns(Hour,1) ,data = accident_hour)
mod2 <- lm(n ~ ns(Hour,2) ,data = accident_hour)
mod3 <- lm(n ~ ns(Hour,3) ,data = accident_hour)
mod4 <- lm(n ~ ns(Hour,4) ,data = accident_hour)
mod5 <- lm(n ~ ns(Hour,5) ,data = accident_hour)
mod6 <- lm(n ~ ns(Hour,6) ,data = accident_hour)
grid <- accident_hour %>%
data_grid(Hour = seq_range(Hour,n = 50,expand = 0.1)) %>%
gather_predictions(mod1,mod2,mod3,mod4,mod5,mod6,.pred = "Y")
grid
## # A tibble: 300 x 3
## model Hour Y
## <chr> <dbl> <dbl>
## 1 mod1 -1.15 34.6
## 2 mod1 -0.634 37.0
## 3 mod1 -0.117 39.3
## 4 mod1 0.399 41.6
## 5 mod1 0.915 44.0
## 6 mod1 1.43 46.3
## 7 mod1 1.95 48.6
## 8 mod1 2.46 51.0
## 9 mod1 2.98 53.3
## 10 mod1 3.50 55.6
## # ... with 290 more rows
ggplot(accident_hour,aes(Hour,n))+
geom_point()+
geom_smooth(data = grid,aes(y = Y),colour = "red")+
facet_wrap(~ model)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
accident_hour <- accident_hour %>%
gather_residuals(mod1,mod2,mod3,mod4,mod5,mod6)
ggplot(accident_hour,aes(Hour,resid)) +
geom_point()+ geom_ref_line(h = 0)+ geom_smooth()+
facet_wrap(~model)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# It seems that as we increase the degrees of freedom, the better our model gets. We will stick with 6 degrees of freedom for the rest of the analysis.
# Now we will add a day of the week variable (dow) and visualize the relationship between count and hour for each day of the week.
dow_hour <- Accidents_2017 %>%
mutate(dow = wday(Accidents_DateTime,label =TRUE))%>%
count(dow,Hour)
dow_hour
## # A tibble: 158 x 3
## dow Hour n
## <ord> <dbl> <int>
## 1 Sun 0 14
## 2 Sun 1 6
## 3 Sun 2 2
## 4 Sun 3 2
## 5 Sun 4 6
## 6 Sun 6 5
## 7 Sun 7 1
## 8 Sun 8 8
## 9 Sun 9 6
## 10 Sun 10 5
## # ... with 148 more rows
ggplot(dow_hour,aes(Hour,n,colour = dow))+
geom_line()+
geom_point()
# From the graph we notice that Sunday and Saturday have a different shape from the rest of the days. Both have a low count between 6 and 9 am compared to other days, Saturday peaks at an earlier time compared to other days. Moreover, Sunday has a lower count between 4 and 7 pm. The rest of the days have a similar shapes and follow the same trends. I think this is because the Sunday and Saturday are weekend days.
#Lets use a model to understand the relationship better.
mod <- lm(n ~ ns(Hour,6) ,data = dow_hour)
mod1 <- lm(n ~ ns(Hour,6) + dow,data = dow_hour)
mod2 <- lm(n ~ ns(Hour,6) * dow,data = dow_hour)
grid <- dow_hour %>%
data_grid(Hour = seq_range(Hour,n = 50,expand = 0.1),dow) %>%
gather_predictions(hour_alone = mod,plus_dow = mod1,multi_dow = mod2,.pred = "Y")
grid
## # A tibble: 1,050 x 4
## model Hour dow Y
## <chr> <dbl> <ord> <dbl>
## 1 hour_alone -1.15 Sun 8.12
## 2 hour_alone -1.15 Mon 8.12
## 3 hour_alone -1.15 Tue 8.12
## 4 hour_alone -1.15 Wed 8.12
## 5 hour_alone -1.15 Thu 8.12
## 6 hour_alone -1.15 Fri 8.12
## 7 hour_alone -1.15 Sat 8.12
## 8 hour_alone -0.634 Sun 6.89
## 9 hour_alone -0.634 Mon 6.89
## 10 hour_alone -0.634 Tue 6.89
## # ... with 1,040 more rows
ggplot(dow_hour,aes(Hour,n))+
geom_point()+
geom_smooth(data = grid,aes(y = Y),colour = "red")+
facet_wrap(~ model)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
dow_hour <- dow_hour %>%
gather_residuals(hour_alone = mod,plus_dow = mod1,multi_dow = mod2)
ggplot(dow_hour,aes(Hour,resid)) +
geom_point()+ geom_ref_line(h = 0)+ geom_smooth()+
facet_wrap(~model)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# It looks like mod2 (Hour * day of the week) gives us the plot with the best residuals. Only 5 points from 168 points are outside a distance of 10 residuals.
dow_hour %>%
filter(model == "multi_dow") %>%
ggplot(aes(Hour,resid,colour = dow))+
geom_ref_line(h = 0) +
geom_point()